1. Imports
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from statsmodels.datasets import get_rdataset
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score, log_loss)
from sklearn.ensemble import (RandomForestRegressor as RF,
                              GradientBoostingRegressor as GBR)
from ISLP.bart import BART
2. Data
Carseats = load_data('Carseats')
High = np.where(Carseats.Sales > 8, "Yes", "No")
3. Fitting a simple tree model
model = MS(Carseats.columns.drop('Sales'), intercept=False)
D = model.fit_transform(Carseats)
feature_names = list(D.columns)
X = np.asarray(D)
clf = DTC(criterion='entropy', max_depth=3, random_state=0)  # depth of 3, entropy as the loss function
clf.fit(X, High)
DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
- Check the prediction error, the value of the loss function, and the decision tree for the classification problem
accuracy_score(High, clf.predict(X))
0.79
- Residual deviance: $-2 \sum_m \sum_k n_{mk} \log \hat{p}_{mk}$
resid_dev = np.sum(log_loss(High, clf.predict_proba(X)))
resid_dev
0.4710647062649358
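- A note on the value above (not part of the original lab): scikit-learn's log_loss returns the average negative log-likelihood over the observations, so the deviance in the formula corresponds to 2 * n times that average; a minimal sketch, assuming the full Carseats sample of X.shape[0] observations:
n_obs = X.shape[0]                              # number of observations used in the fit
avg_nll = log_loss(High, clf.predict_proba(X))  # mean negative log-likelihood (~0.471)
total_deviance = 2 * n_obs * avg_nll            # roughly 2 * 400 * 0.471, about 377
total_deviance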
ax = subplots(figsize=(12,12))[1]
plot_tree(clf, feature_names=feature_names, ax=ax);
print(export_text(clf, feature_names=feature_names, show_weights=True))
# weights: the proportions of the "No" and "Yes" classes at each node
|--- ShelveLoc[Good] <= 0.50
| |--- Price <= 92.50
| | |--- Income <= 57.00
| | | |--- weights: [7.00, 3.00] class: No
| | |--- Income > 57.00
| | | |--- weights: [7.00, 29.00] class: Yes
| |--- Price > 92.50
| | |--- Advertising <= 13.50
| | | |--- weights: [183.00, 41.00] class: No
| | |--- Advertising > 13.50
| | | |--- weights: [20.00, 25.00] class: Yes
|--- ShelveLoc[Good] > 0.50
| |--- Price <= 135.00
| | |--- US[Yes] <= 0.50
| | | |--- weights: [6.00, 11.00] class: Yes
| | |--- US[Yes] > 0.50
| | | |--- weights: [2.00, 49.00] class: Yes
| |--- Price > 135.00
| | |--- Income <= 46.00
| | | |--- weights: [6.00, 0.00] class: No
| | |--- Income > 46.00
| | | |--- weights: [5.00, 6.00] class: Yes
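- As an aside (not from the original lab), the entropy criterion used above can be checked by hand at any leaf; a minimal sketch for the first leaf in the printout, whose weights are [7.00, 3.00]:
counts = np.array([7.0, 3.0])         # class counts [No, Yes] at the leaf
p = counts / counts.sum()             # class proportions [0.7, 0.3]
entropy = -np.sum(p * np.log2(p))     # Shannon entropy in bits, ~0.881
entropy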
- Check the validation error
validation = skm.ShuffleSplit(n_splits=1, test_size=200, random_state=0)
results = skm.cross_validate(clf, D, High, cv=validation)
results['test_score']
array([0.685])
- Split the data into two sets and carry out training and evaluation
(X_train, X_test, High_train, High_test) = skm.train_test_split(X, High, test_size=0.5, random_state=0)
clf = DTC(criterion='entropy', random_state=0)
clf.fit(X_train, High_train)
accuracy_score(High_test, clf.predict(X_test))
0.735
- Select the optimal pruning on the training data via CV
ccp_path = clf.cost_complexity_pruning_path(X_train, High_train)
kfold = skm.KFold(10, random_state=1, shuffle=True)
grid = skm.GridSearchCV(clf,
                        {'ccp_alpha': ccp_path.ccp_alphas},
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_train, High_train)
grid.best_score_
0.685
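- Beyond the single best score, GridSearchCV exposes the full cross-validation path in cv_results_; a sketch (not in the original lab) of how one might plot CV accuracy against ccp_alpha:
alphas = np.asarray(grid.cv_results_['param_ccp_alpha'], dtype=float)
cv_acc = grid.cv_results_['mean_test_score']
ax = subplots(figsize=(8, 8))[1]
ax.plot(alphas, cv_acc)
ax.set_xlabel('ccp_alpha')
ax.set_ylabel('CV accuracy');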
- Plot the optimal decision tree selected by CV
ax = subplots(figsize=(12, 12))[1]
best_ = grid.best_estimator_
plot_tree(best_, feature_names=feature_names, ax=ax);
best_.tree_.n_leaves
30
- Build the model on the training data and examine the confusion matrix on the test data
print(accuracy_score(High_test, best_.predict(X_test)))
confusion = confusion_table(best_.predict(X_test), High_test)
print(confusion)
0.72
Truth       No  Yes
Predicted
No          94   32
Yes         24   50
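- A quick check (not part of the original lab) that the reported accuracy is consistent with the confusion table above, along with class-wise rates; the counts are copied from the printout:
tn, fn = 94, 32                                # predicted No:  truth No / truth Yes
fp, tp = 24, 50                                # predicted Yes: truth No / truth Yes
accuracy = (tn + tp) / (tn + fn + fp + tp)     # (94 + 50) / 200 = 0.72
sensitivity = tp / (tp + fn)                   # recall for "Yes": 50 / 82, about 0.61
specificity = tn / (tn + fp)                   # recall for "No":  94 / 118, about 0.80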
- Load the Boston data and fit a regression decision tree
Boston = load_data("Boston")
model = MS(Boston.columns.drop('medv'), intercept=False)
D = model.fit_transform(Boston)
feature_names = list(D.columns)
X = np.asarray(D)
(X_train, X_test, y_train, y_test) = skm.train_test_split(X, Boston['medv'], test_size=0.3, random_state=0)
reg = DTR(max_depth=3)
reg.fit(X_train, y_train)
ax = subplots(figsize=(12,12))[1]
plot_tree(reg, feature_names=feature_names, ax=ax);
- Perform optimal pruning via CV
ccp_path = reg.cost_complexity_pruning_path(X_train, y_train)
kfold = skm.KFold(5, shuffle=True, random_state=10)
grid = skm.GridSearchCV(reg,
                        {'ccp_alpha': ccp_path.ccp_alphas},
                        refit=True,
                        cv=kfold,
                        scoring='neg_mean_squared_error')
G = grid.fit(X_train, y_train)
best_ = grid.best_estimator_
np.mean((y_test - best_.predict(X_test))**2)
28.06985754975404
ax = subplots(figsize=(12,12))[1]
plot_tree(G.best_estimator_, feature_names=feature_names, ax=ax);
4. RF (Random Forests)
- Run a random forest on the Boston data
bag_boston = RF(max_features=X_train.shape[1], random_state=0)
bag_boston.fit(X_train, y_train)
RandomForestRegressor(max_features=12, random_state=0)
- Check accuracy with a scatterplot of the predictions against the test values, and compute the test error
ax = subplots(figsize=(8,8))[1]
y_hat_bag = bag_boston.predict(X_test)
ax.scatter(y_hat_bag, y_test)
np.mean((y_test - y_hat_bag)**2)
14.634700151315787
- Adjust the number of variables considered by the random forest
bag_boston = RF(max_features=X_train.shape[1],
                n_estimators=500,
                random_state=0).fit(X_train, y_train)
y_hat_bag = bag_boston.predict(X_test)
print(np.mean((y_test - y_hat_bag)**2))

# using a subset of the predictors (max_features=6) at each split
RF_boston = RF(max_features=6, random_state=0).fit(X_train, y_train)
y_hat_RF = RF_boston.predict(X_test)
print(np.mean((y_test - y_hat_RF)**2))
14.605662565263161
20.04276446710527
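- A sketch (not from the original lab) of how one might compare several max_features settings side by side; the values other than 6 and p = 12 are illustrative, and n_estimators=500 is fixed here as a choice:
for m in (3, 6, 9, X_train.shape[1]):
    rf = RF(max_features=m, n_estimators=500,
            random_state=0).fit(X_train, y_train)
    mse = np.mean((y_test - rf.predict(X_test))**2)
    print(f'max_features={m}: test MSE = {mse:.2f}')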
- Check the feature importances, which measure how much each variable contributes
feature_imp = pd.DataFrame({'importance': RF_boston.feature_importances_},
                           index=feature_names)
print(feature_imp.sort_values(by='importance', ascending=False))
importance
lstat 0.356203
rm 0.332163
ptratio 0.067270
crim 0.055404
indus 0.053851
dis 0.041582
nox 0.035225
tax 0.025355
age 0.021506
rad 0.004784
chas 0.004203
zn 0.002454
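- The ranking is easier to read as a chart; a minimal sketch (not from the original lab) using the feature_imp data frame built above:
ax = subplots(figsize=(8, 8))[1]
feature_imp.sort_values(by='importance').plot.barh(ax=ax, legend=False)
ax.set_xlabel('importance');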
- Run gradient boosting
- 50,000 decision trees, maximum depth 3, learning rate 0.001
boost_boston = GBR(n_estimators=50000,
                   learning_rate=0.001,
                   max_depth=3,
                   random_state=0)
boost_boston.fit(X_train, y_train)
test_error = np.zeros_like(boost_boston.train_score_)
for idx, y_ in enumerate(boost_boston.staged_predict(X_test)):
    test_error[idx] = np.mean((y_test - y_)**2)
plot_idx = np.arange(boost_boston.train_score_.shape[0])
ax = subplots(figsize=(8,8))[1]
ax.plot(plot_idx, boost_boston.train_score_, 'b', label='Training')
ax.plot(plot_idx, test_error, 'r', label='Test')
ax.legend();
- The figure above shows the training and test errors as the boosting iterations proceed; below, compute the test error
y_hat_boost = boost_boston.predict(X_test)
np.mean((y_test - y_hat_boost)**2)
13.461677803903655
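- Since test_error above holds the staged test MSE at every boosting iteration, one can also locate where it bottoms out; a small sketch, not part of the original lab:
best_iter = int(np.argmin(test_error))           # iteration with the lowest staged test MSE
print(best_iter, test_error[best_iter])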
- Test error after adjusting the learning rate
boost_boston = GBR(n_estimators=5000,
                   learning_rate=0.2,
                   max_depth=3,
                   random_state=0)
boost_boston.fit(X_train, y_train)
y_hat_boost = boost_boston.predict(X_test)
np.mean((y_test - y_hat_boost)**2)
14.501514553719565
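- A sketch (not from the original lab) of comparing several learning rates directly; the rates other than 0.001 and 0.2 are illustrative, and the loop is slow to run:
for lr in (0.001, 0.01, 0.1, 0.2):
    gbr = GBR(n_estimators=5000, learning_rate=lr,
              max_depth=3, random_state=0).fit(X_train, y_train)
    mse = np.mean((y_test - gbr.predict(X_test))**2)
    print(f'learning_rate={lr}: test MSE = {mse:.2f}')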
5. Bayesian decision trees (BART)
- Check the test error on the evaluation data
bart_boston = BART(random_state=0, burnin=5, ndraw=15)
# burnin: number of initial MCMC draws to discard; ndraw: number of draws to keep
bart_boston.fit(X_train, y_train)
yhat_test = bart_boston.predict(X_test.astype(np.float32))
np.mean((y_test - yhat_test)**2)
22.145009458109225
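- burnin=5 and ndraw=15 keep the lab fast but are far fewer MCMC draws than a typical BART run; a sketch with larger, purely illustrative settings (not from the original lab):
bart_long = BART(random_state=0, burnin=100, ndraw=200)   # illustrative values
bart_long.fit(X_train, y_train)
yhat_long = bart_long.predict(X_test.astype(np.float32))
np.mean((y_test - yhat_long)**2)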
- Variable importance
var_inclusion = pd.Series(bart_boston.variable_inclusion_.mean(0),
                          index=D.columns)
print(var_inclusion)
crim 26.933333
zn 27.866667
indus 26.466667
chas 22.466667
nox 26.600000
rm 29.800000
age 22.733333
dis 26.466667
rad 23.666667
tax 24.133333
ptratio 24.266667
lstat 31.000000
dtype: float64
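- The inclusion counts are easier to compare when sorted or plotted; a minimal sketch (not from the original lab) using the var_inclusion series built above:
ax = subplots(figsize=(8, 8))[1]
var_inclusion.sort_values().plot.barh(ax=ax)
ax.set_xlabel('average number of appearances across trees');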